In [ ]:
# 📊 Netflix Data Analysis & Recommendation System
### 🚀 Business Problem
Netflix wants to understand content trends and improve recommendations.
This project analyzes **movies vs. TV shows trends, genres, ratings, and country-wise distribution.**
It also builds a **content-based recommendation system** using machine learning.
In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
In [7]:
# Load the Netflix dataset
df = pd.read_csv("/Users/chiragmudgal/Netflix EDA/netflix_titles.csv", encoding="ISO-8859-1")
# Create a copy for cleaning
df_cleaned = df.copy()
# Fill missing values
df_cleaned.fillna("Unknown", inplace=True)
for col in df_cleaned.columns:
if df_cleaned[col].dtype == "object":
df_cleaned.loc[:, col] = df_cleaned[col].fillna("Unknown") # Fix for categorical columns
elif df_cleaned[col].dtype in ["int64", "float64"]:
df_cleaned.loc[:, col] = df_cleaned[col].fillna(df_cleaned[col].median()) # Fix for numeric columns
elif df_cleaned[col].dtype == "datetime64[ns]":
df_cleaned.loc[:, col] = df_cleaned[col].fillna(pd.NaT) # Fix for datetime columns
# Display dataset information
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8809 entries, 0 to 8808 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 8809 non-null object 1 type 8809 non-null object 2 title 8809 non-null object 3 director 6175 non-null object 4 cast 7984 non-null object 5 country 7978 non-null object 6 date_added 8799 non-null object 7 release_year 8809 non-null int64 8 rating 8805 non-null object 9 duration 8806 non-null object 10 listed_in 8809 non-null object 11 description 8809 non-null object 12 Unnamed: 12 0 non-null float64 13 Unnamed: 13 0 non-null float64 14 Unnamed: 14 0 non-null float64 15 Unnamed: 15 0 non-null float64 16 Unnamed: 16 0 non-null float64 17 Unnamed: 17 0 non-null float64 18 Unnamed: 18 0 non-null float64 19 Unnamed: 19 0 non-null float64 20 Unnamed: 20 0 non-null float64 21 Unnamed: 21 0 non-null float64 22 Unnamed: 22 0 non-null float64 23 Unnamed: 23 0 non-null float64 24 Unnamed: 24 0 non-null float64 25 Unnamed: 25 0 non-null float64 dtypes: float64(14), int64(1), object(11) memory usage: 1.7+ MB
/var/folders/j5/y4vsdpc54ns14r55p8cxl_900000gn/T/ipykernel_14046/2197598728.py:9: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'Unknown' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
df_cleaned.fillna("Unknown", inplace=True)
Out[7]:
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | ... | Unnamed: 16 | Unnamed: 17 | Unnamed: 18 | Unnamed: 19 | Unnamed: 20 | Unnamed: 21 | Unnamed: 22 | Unnamed: 23 | Unnamed: 24 | Unnamed: 25 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 26 columns
In [6]:
## 🛠 Data Cleaning & Preprocessing
#We handle missing values, convert date formats, and prepare features for analysis.
# Now you can check for missing values
print(df_cleaned.isnull().sum()) # Shows count of missing values per column
print(df_cleaned.dtypes) # Shows data types of each column
show_id 0 type 0 title 0 director 0 cast 0 country 0 date_added 0 release_year 0 rating 0 duration 0 listed_in 0 description 0 Unnamed: 12 0 Unnamed: 13 0 Unnamed: 14 0 Unnamed: 15 0 Unnamed: 16 0 Unnamed: 17 0 Unnamed: 18 0 Unnamed: 19 0 Unnamed: 20 0 Unnamed: 21 0 Unnamed: 22 0 Unnamed: 23 0 Unnamed: 24 0 Unnamed: 25 0 dtype: int64 show_id object type object title object director object cast object country object date_added object release_year int64 rating object duration object listed_in object description object Unnamed: 12 object Unnamed: 13 object Unnamed: 14 object Unnamed: 15 object Unnamed: 16 object Unnamed: 17 object Unnamed: 18 object Unnamed: 19 object Unnamed: 20 object Unnamed: 21 object Unnamed: 22 object Unnamed: 23 object Unnamed: 24 object Unnamed: 25 object dtype: object
In [8]:
## 📊 Exploratory Data Analysis (EDA)
## We visualize content distribution, trends, and genre popularity.
# Interactive Bar Chart: Movies vs. TV Shows
df_type_counts = df_cleaned['type'].value_counts().reset_index()
df_type_counts.columns = ['type', 'count'] # Rename columns for clarity
fig = px.bar(df_type_counts,
x='type', y='count',
color='type',
title='Distribution of Movies and TV Shows on Netflix')
fig.show()
In [9]:
# Interactive Line Chart: Content Added Over Years
content_per_year = df_cleaned.groupby('release_year').size().reset_index()
content_per_year.columns = ['release_year', 'count'] # Rename columns
fig = px.line(content_per_year, x='release_year', y='count',
title='Netflix Content Released Over the Years', markers=True)
fig.show()
In [10]:
#Genre Distribution Plot
genre_counts = df_cleaned["listed_in"].str.split(", ").explode().value_counts().reset_index()
genre_counts.columns = ['genre', 'count'] # Rename columns
fig = px.bar(genre_counts.head(10), x='genre', y='count', title='Top 10 Most Common Genres')
fig.show()
In [11]:
## 📢 Final Insights & Business Recommendations
# **TV Shows have grown significantly since 2015.**
# **Mature-rated content (TV-MA) dominates Netflix’s catalog.**
# **Recommendation:** Invest in binge-worthy series and niche genres (e.g., K-Dramas).
In [ ]: